# Importing libraries
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
import plotly.express as px
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pwd
'D:\\vu\\terms\\term 8\\Methods and Algorithm in ML- 1\\final-term'
# Working directory holding the competition CSVs — machine-specific path.
os.chdir('D:\\vu\\terms\\term 8\\Methods and Algorithm in ML- 1\\final-term')
# Labelled training set (has is_fraud) and unlabelled test set.
train= pd.read_csv("train_data.csv")
test= pd.read_csv("test_data.csv")
train.head()
| transaction_number | user_id | payment_method | partner_id | partner_category | country | device_type | money_transacted | transaction_initiation | partner_pricing_category | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 144703125000 | 17539344 | sbi_atm_cum_debit_card | 47334 | cat_1 | IND_INR | android_devices | -5.0 | 2016-11-15 19:16:12+00:00 | 2 | 0 |
| 1 | 77406814453032 | 24710841 | e_wallet_payments | 78890 | cat_2 | IND_INR | other_pcs | 100.0 | 2017-01-11 09:25:33+00:00 | 2 | 0 |
| 2 | 308929485482801 | 24265476 | e_wallet_payments | 78890 | cat_2 | IND_INR | other_pcs | 50.0 | 2016-12-07 07:58:09+00:00 | 2 | 0 |
| 3 | 665270027747073 | 10240000 | other_debit_cards | 102557 | cat_3 | IND_INR | other_pcs | 1000.0 | 2017-01-11 16:15:44+00:00 | 2 | 0 |
| 4 | 38276160171101 | 5880625 | other_debit_cards | 118335 | cat_1 | IND_INR | other_pcs | 200.0 | 2016-11-16 17:04:42+00:00 | 2 | 0 |
test.head()
| transaction_number | user_id | payment_method | partner_id | partner_category | country | device_type | money_transacted | transaction_initiation | partner_pricing_category | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 857050141038272 | 3802500 | visa_master_debit_cards | 118335 | cat_1 | IND_INR | other_pcs | 200.0 | 2016-12-28 14:44:37+00:00 | 2 |
| 1 | 4400462872603 | 200704 | e_wallet_payments | 23667 | cat_2 | IND_INR | other_pcs | 20.0 | 2016-12-14 17:49:48+00:00 | 2 |
| 2 | 1207915598569499 | 25150225 | sbi_atm_cum_debit_card | 47334 | cat_1 | IND_INR | android_devices | -100.0 | 2017-01-21 10:18:11+00:00 | 2 |
| 3 | 2009725616777536 | 21827584 | visa_master_credit_cards | 78890 | cat_2 | IND_INR | other_pcs | 200.0 | 2017-01-16 13:25:10+00:00 | 1 |
| 4 | 2319205210274863 | 576081 | e_wallet_payments | 23667 | cat_2 | IND_INR | other_pcs | 100.0 | 2017-02-09 12:47:28+00:00 | 2 |
train.shape, test.shape
((76529, 11), (19133, 10))
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 76529 entries, 0 to 76528 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 transaction_number 76529 non-null int64 1 user_id 76529 non-null int64 2 payment_method 76529 non-null object 3 partner_id 76529 non-null int64 4 partner_category 76529 non-null object 5 country 76529 non-null object 6 device_type 76529 non-null object 7 money_transacted 76529 non-null float64 8 transaction_initiation 76529 non-null object 9 partner_pricing_category 76529 non-null int64 10 is_fraud 76529 non-null int64 dtypes: float64(1), int64(5), object(5) memory usage: 6.4+ MB
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19133 entries, 0 to 19132 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 transaction_number 19133 non-null int64 1 user_id 19133 non-null int64 2 payment_method 19133 non-null object 3 partner_id 19133 non-null int64 4 partner_category 19133 non-null object 5 country 19133 non-null object 6 device_type 19133 non-null object 7 money_transacted 19133 non-null float64 8 transaction_initiation 19133 non-null object 9 partner_pricing_category 19133 non-null int64 dtypes: float64(1), int64(4), object(5) memory usage: 1.5+ MB
# Transaction number is unique and works as an id for each transaction.
# Keep the ids aside (needed to build a submission later) before dropping.
label_train= train['transaction_number']
# Select all columns except "transaction_number"
train.drop(['transaction_number'], axis=1, inplace=True)
train.head(3)
| user_id | payment_method | partner_id | partner_category | country | device_type | money_transacted | transaction_initiation | partner_pricing_category | is_fraud | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17539344 | sbi_atm_cum_debit_card | 47334 | cat_1 | IND_INR | android_devices | -5.0 | 2016-11-15 19:16:12+00:00 | 2 | 0 |
| 1 | 24710841 | e_wallet_payments | 78890 | cat_2 | IND_INR | other_pcs | 100.0 | 2017-01-11 09:25:33+00:00 | 2 | 0 |
| 2 | 24265476 | e_wallet_payments | 78890 | cat_2 | IND_INR | other_pcs | 50.0 | 2016-12-07 07:58:09+00:00 | 2 | 0 |
# Same treatment for the test set: stash the ids, then drop the column.
label_test= test['transaction_number']
## Select all columns except "transaction_number"
test.drop(['transaction_number'], axis=1, inplace=True)
train.nunique()
user_id 3594 payment_method 6 partner_id 23 partner_category 9 country 1 device_type 4 money_transacted 1454 transaction_initiation 75811 partner_pricing_category 4 is_fraud 2 dtype: int64
test.nunique()
user_id 2617 payment_method 6 partner_id 20 partner_category 8 country 1 device_type 4 money_transacted 695 transaction_initiation 19090 partner_pricing_category 4 dtype: int64
train.describe()
| user_id | partner_id | money_transacted | partner_pricing_category | is_fraud | |
|---|---|---|---|---|---|
| count | 7.652900e+04 | 76529.000000 | 76529.000000 | 76529.000000 | 76529.000000 |
| mean | 1.247483e+07 | 58497.189105 | 132.724348 | 2.255707 | 0.002012 |
| std | 1.205878e+07 | 36740.216787 | 2350.110900 | 0.732174 | 0.044814 |
| min | 1.000000e+00 | 7889.000000 | -20000.000000 | 0.000000 | 0.000000 |
| 25% | 3.515625e+06 | 23667.000000 | -1.000000 | 2.000000 | 0.000000 |
| 50% | 9.753129e+06 | 47334.000000 | 20.000000 | 2.000000 | 0.000000 |
| 75% | 1.788444e+07 | 78890.000000 | 52.000000 | 2.000000 | 0.000000 |
| max | 5.592048e+07 | 213003.000000 | 197217.760000 | 4.000000 | 1.000000 |
# Pairwise Pearson correlations of the numeric columns; per the printed
# matrix, money_transacted has the strongest link to is_fraud (~0.57).
print(train.corr())
# plotting correlation heatmap
dataplot = sns.heatmap(train.corr(), annot=True)
# displaying heatmap
plt.show()
user_id partner_id money_transacted \
user_id 1.000000 -0.037846 -0.045650
partner_id -0.037846 1.000000 0.092432
money_transacted -0.045650 0.092432 1.000000
partner_pricing_category 0.424626 -0.185584 -0.066396
is_fraud -0.000064 0.062991 0.570388
partner_pricing_category is_fraud
user_id 0.424626 -0.000064
partner_id -0.185584 0.062991
money_transacted -0.066396 0.570388
partner_pricing_category 1.000000 -0.034002
is_fraud -0.034002 1.000000
# Creating a column named fraud for visualization purposes
# ('Yes'/'No' reads better in plotly legends than 0/1).
is_fraud_map = {0:'No',1:'Yes'}
train['fraud'] = train['is_fraud'].map(is_fraud_map)
# train.head(2)
# Money_transacted v/s user_id wrt to fraud
fig = px.scatter(train, x='user_id', y='money_transacted', color='fraud', hover_name= 'partner_id',
title="Money_transacted v/s user_id wrt to fraud")
fig.show()
# Count of different payment methods.
train.payment_method.value_counts()
sbi_atm_cum_debit_card 30538 e_wallet_payments 27384 visa_master_debit_cards 11643 other_debit_cards 4495 visa_master_credit_cards 2454 unified_payments_interface 15 Name: payment_method, dtype: int64
# Plot of payment_method wrt to the money_transacted
# (boxen plot: amount distribution per payment method).
sns.catplot(y='money_transacted' , x='payment_method' ,
data = train.sort_values('money_transacted',ascending=False) ,
kind='boxen' , height=6 , aspect=3)
plt.show(True)
# Money_transacted v/s payment_method wrt to fraud
fig = px.scatter(train, x="money_transacted", y="payment_method", color="fraud",
title="Money_transacted v/s payment_method wrt to fraud")
fig.show()
train['partner_id'].value_counts()
47334 26105 23667 19526 78890 12273 118335 9546 7889 2317 31556 1892 165669 1216 86779 1121 149891 767 110446 640 102557 231 15778 189 157780 165 63112 159 189336 146 173558 89 55223 75 213003 25 39445 19 71001 15 126224 10 94668 2 181447 1 Name: partner_id, dtype: int64
# Plot of partner_id wrt to the money_transacted
# (boxen plot: amount distribution per partner).
sns.catplot(y='money_transacted' , x='partner_id' ,
data = train.sort_values('money_transacted',ascending=False) ,
kind='boxen' , height=6 , aspect=3)
plt.show(True)
# Money_transacted v/s partner_id wrt to fraud
fig = px.scatter(train, x="money_transacted", y="partner_id", color="fraud", hover_name= 'user_id',
title="Money_transacted v/s payment_method wrt to fraud")
fig.show()
train['partner_category'].value_counts()
cat_1 36306 cat_2 36019 cat_3 1536 cat_4 1310 cat_5 1026 cat_7 165 cat_6 146 cat_8 19 cat_9 2 Name: partner_category, dtype: int64
Category 1 and 2 make up for most of the transactions.
# Plot of partner_category wrt to the money_transacted
# (boxen plot: amount distribution per category).
sns.catplot(y='money_transacted' , x='partner_category' ,
data = train.sort_values('money_transacted',ascending=False) ,
kind='boxen' , height=6 , aspect=3)
plt.show(True)
# Money_transacted v/s partner_category wrt to fraud
fig = px.scatter(train, x="money_transacted", y="partner_category", color="fraud",
title="Money_transacted v/s partner_category wrt to fraud")
fig.show()
train['country'].value_counts()
IND_INR 76529 Name: country, dtype: int64
# 'country' is constant (only IND_INR per value_counts) — zero information, drop it.
train.drop(['country'], axis=1, inplace=True)
test.drop(['country'], axis=1, inplace=True)
train['device_type'].value_counts()
other_pcs 45560 android_devices 29662 ios_devices 876 windows_pcs 431 Name: device_type, dtype: int64
# Money_transacted v/s device_type wrt to fraud
fig = px.scatter(train, x="money_transacted", y="device_type", color="fraud",
title="Money_transacted v/s device_type wrt to fraud")
fig.show()
# Plot to show the density of frauds wrt the amount of money transacted
# View 1: full range of money_transacted.
facet = sns.FacetGrid(train, hue="is_fraud",aspect=4)
facet.map(sns.kdeplot,'money_transacted',shade= True)
# facet.set(xlim=(0, train['money_transacted'].max()))
facet.add_legend()
plt.show()
# View 2: zoom on the mid range (-750 to 700).
facet = sns.FacetGrid(train, hue="is_fraud",aspect=4)
facet.map(sns.kdeplot,'money_transacted',shade= True)
facet.set(xlim=(0, train['money_transacted'].max()))
facet.add_legend()
plt.xlim(-750,700)
plt.show()
# View 3: zoom on the high tail (700 up to the max).
facet = sns.FacetGrid(train, hue="is_fraud",aspect=4)
facet.map(sns.kdeplot,'money_transacted',shade= True)
facet.set(xlim=(0, train['money_transacted'].max()))
facet.add_legend()
plt.xlim(700,max(train.money_transacted))
plt.show()
# View 4: zoom on the low (negative) tail, min up to -750.
facet = sns.FacetGrid(train, hue="is_fraud",aspect=4)
facet.map(sns.kdeplot,'money_transacted',shade= True)
facet.add_legend()
plt.xlim(min(train.money_transacted),-750)
plt.show()
max(train.money_transacted)
197217.76
min(train.money_transacted)
-20000.0
-200 to 400 is the range where there are the fewest frauds, so we can bin it as green;
-750 to -200 and 420 to 700 have a moderate amount of frauds, so we can bin them as yellow;
-20000.0 to -750 and 700 to 197217.76 are the amounts where almost all the payments are fraudulent, so we can bin them as red.
# Binning to create a new column named money_bins:
def get_money_frauds(x):
    """Map a transacted amount to a fraud-risk colour bin.

    Bands (from the KDE plots above):
      'Green'  : -200 <= x < 400            — lowest fraud rate
      'Yellow' : -750 <= x < -200 or 420 <= x < 700 — moderate fraud
      'Red'    : everything else            — almost all fraudulent

    Fix: the original used ``x in range(a, b)``, and range membership only
    matches integral values — every non-integer amount (money_transacted is
    float64, e.g. 150.5 or 197217.76) silently fell through to 'Red'.
    Chained comparisons bin floats correctly and preserve the integer
    boundaries of the original ranges.
    """
    if -200 <= x < 400:
        return 'Green'
    if -750 <= x < -200 or 420 <= x < 700:
        return 'Yellow'
    return 'Red'
# Apply the risk binning to every training transaction.
train['money_bins']=train['money_transacted'].apply(get_money_frauds)
# Box plot to detect outliers with respect to is_fraud column
fig = px.box(train, x='fraud',y="money_transacted")
fig.show()
# Converting it to datetime object and creating new columns
import datetime
from datetime import datetime, date
# NOTE(review): the assignments below shadow the `date` import (and use the
# names `time`/`year`/... as plain variables); the imports are never used.
train['transaction_initiation']= pd.to_datetime(train['transaction_initiation'])
date=train['transaction_initiation'].dt.date
time= train['transaction_initiation'].dt.time
year=train['transaction_initiation'].dt.year
month= train['transaction_initiation'].dt.month
day= train['transaction_initiation'].dt.day
yearofday = train['transaction_initiation'].dt.dayofyear
hour=train['transaction_initiation'].dt.hour
minute=train['transaction_initiation'].dt.minute
second =train['transaction_initiation'].dt.second
# Attach the extracted calendar/time components as model features.
train['transaction_date']= date
train['transaction_time']= time
train['transaction_year']= year
train['transaction_month']= month
train['transaction_day']= day
# train['transaction_yearofday']= yearofday
train['transaction_hour']= hour
train['transaction_minute']= minute
# train['transaction_second']= second
train.head(2)
| user_id | payment_method | partner_id | partner_category | device_type | money_transacted | transaction_initiation | partner_pricing_category | is_fraud | fraud | money_bins | transaction_date | transaction_time | transaction_year | transaction_month | transaction_day | transaction_hour | transaction_minute | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17539344 | sbi_atm_cum_debit_card | 47334 | cat_1 | android_devices | -5.0 | 2016-11-15 19:16:12+00:00 | 2 | 0 | No | Green | 2016-11-15 | 19:16:12 | 2016 | 11 | 15 | 19 | 16 |
| 1 | 24710841 | e_wallet_payments | 78890 | cat_2 | other_pcs | 100.0 | 2017-01-11 09:25:33+00:00 | 2 | 0 | No | Green | 2017-01-11 | 09:25:33 | 2017 | 1 | 11 | 9 | 25 |
# Timeline view: when the fraudulent transactions occurred.
fig = px.scatter(train, x='transaction_initiation', y='fraud', hover_name='money_transacted')
fig.show()
train['partner_pricing_category'].value_counts()
2 63899 4 10833 1 1497 0 300 Name: partner_pricing_category, dtype: int64
# Plot of partner_pricing_category wrt to the money_transacted
sns.catplot(y='money_transacted' , x='partner_pricing_category' ,
data = train.sort_values('money_transacted',ascending=False) ,
kind='boxen' , height=6 , aspect=3)
plt.show(True)
# Target is heavily imbalanced: 154 frauds out of 76,529 rows (~0.2%).
train['is_fraud'].value_counts()
0 76375 1 154 Name: is_fraud, dtype: int64
# label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# NOTE(review): the single encoder `le` is re-fit for every column here and
# re-fit again on the test set below, so train/test integer codes for the
# same category are not guaranteed to match. Fitting one encoder per column
# on the combined train+test values would be safer — TODO confirm impact.
train["user_id"]= le.fit_transform(train["user_id"])
train["payment_method"] =le.fit_transform(train["payment_method"])
train["partner_id"] =le.fit_transform(train["partner_id"])
train['partner_category'] = le.fit_transform(train['partner_category'])
train["device_type"] =le.fit_transform(train["device_type"])
train['money_bins']= le.fit_transform(train['money_bins'])
# Dropping unnecessary columns
train.drop(columns=['transaction_initiation','transaction_date','transaction_time','fraud'],inplace=True)
train.head(2)
| user_id | payment_method | partner_id | partner_category | device_type | money_transacted | partner_pricing_category | is_fraud | money_bins | transaction_year | transaction_month | transaction_day | transaction_hour | transaction_minute | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2709 | 2 | 5 | 0 | 0 | -5.0 | 2 | 0 | 0 | 2016 | 11 | 15 | 19 | 16 |
| 1 | 3267 | 0 | 9 | 1 | 2 | 100.0 | 2 | 0 | 0 | 2017 | 1 | 11 | 9 | 25 |
# dealing with the datetime column: transaction_initiation
# (mirror of the training-set feature extraction above)
test['transaction_initiation']= pd.to_datetime(test['transaction_initiation'])
date= test['transaction_initiation'].dt.date
time= test['transaction_initiation'].dt.time
year=test['transaction_initiation'].dt.year
month= test['transaction_initiation'].dt.month
day= test['transaction_initiation'].dt.day
weekday= test['transaction_initiation'].dt.weekday
yearofday = test['transaction_initiation'].dt.dayofyear
hour=test['transaction_initiation'].dt.hour
minute=test['transaction_initiation'].dt.minute
second =test['transaction_initiation'].dt.second
test['transaction_date']= date
test['transaction_time']= time
test['transaction_year']= year
test['transaction_month']= month
test['transaction_day']= day
# test['transaction_yearofday']= yearofday
test['transaction_hour']= hour
test['transaction_minute']= minute
# test['transaction_second']= second
# creating the money_bins column
test['money_bins']=test['money_transacted'].apply(get_money_frauds)
# label encoding
# NOTE(review): re-fitting the encoder here can assign different integer
# codes than the training set got — see the note in the train-encoding cell.
test["user_id"]=le.fit_transform(test["user_id"])
test["payment_method"]=le.fit_transform(test["payment_method"])
test["partner_id"]=le.fit_transform(test["partner_id"])
test['partner_category'] = le.fit_transform(test['partner_category'])
test["device_type"]=le.fit_transform(test["device_type"])
test['money_bins']= le.fit_transform(test['money_bins'])
# Dropping unnecessary columns
test.drop(columns=['transaction_initiation', 'transaction_date','transaction_time'],inplace=True)
test.head(2)
| user_id | payment_method | partner_id | partner_category | device_type | money_transacted | partner_pricing_category | transaction_year | transaction_month | transaction_day | transaction_hour | transaction_minute | money_bins | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 778 | 5 | 13 | 0 | 2 | 200.0 | 2 | 2016 | 12 | 28 | 14 | 44 | 0 |
| 1 | 67 | 0 | 2 | 1 | 2 | 20.0 | 2 | 2016 | 12 | 14 | 17 | 49 | 0 |
train.columns
Index(['user_id', 'payment_method', 'partner_id', 'partner_category',
'device_type', 'money_transacted', 'partner_pricing_category',
'is_fraud', 'money_bins', 'transaction_year', 'transaction_month',
'transaction_day', 'transaction_hour', 'transaction_minute'],
dtype='object')
test.columns
Index(['user_id', 'payment_method', 'partner_id', 'partner_category',
'device_type', 'money_transacted', 'partner_pricing_category',
'transaction_year', 'transaction_month', 'transaction_day',
'transaction_hour', 'transaction_minute', 'money_bins'],
dtype='object')
# checking the dimensions of train and test data
train.shape, test.shape
((76529, 14), (19133, 13))
# Separating out the target variable for modelling
y = train['is_fraud']
y.head(3)
0 0 1 0 2 0 Name: is_fraud, dtype: int64
# Remove the target from the feature matrix now that it is stored in y.
train.drop(columns=['is_fraud'],inplace=True)
from sklearn.tree import DecisionTreeClassifier
# create the decision tree with default hyperparameters.
model = DecisionTreeClassifier()
# NOTE(review): x_train/y_train are only created by train_test_split further
# down — this cell depends on out-of-order notebook execution. Verify the
# intended run order before converting this notebook to a script.
# fit the model to start training.
model.fit(x_train, y_train)
# get the importance of the resulting features.
importances = model.feature_importances_
# create a data frame for visualization.
final_df = pd.DataFrame({"Features": x_train.columns, "importances":importances})
final_df.set_index("importances")
# sort in ascending order to better visualization.
final_df = final_df.sort_values( "importances")
# plot the feature importances in bars.
final_df.plot.bar(color = "teal")
print(importances)
[4.40655046e-02 1.98160614e-02 8.56253823e-01 4.49848022e-04 2.32490118e-02 6.19427095e-03 4.99714802e-02]
x_train.columns
Index(['user_id', 'payment_method', 'money_transacted',
'partner_pricing_category', 'transaction_year', 'transaction_month',
'transaction_day'],
dtype='object')
# Keep only the seven features the importance plot rated highest.
new_train= train[['user_id', 'payment_method', 'money_transacted', 'partner_pricing_category', 'transaction_year',
'transaction_month', 'transaction_day']]
new_train.head()
| user_id | payment_method | money_transacted | partner_pricing_category | transaction_year | transaction_month | transaction_day | |
|---|---|---|---|---|---|---|---|
| 0 | 2709 | 2 | -5.0 | 2 | 2016 | 11 | 15 |
| 1 | 3267 | 0 | 100.0 | 2 | 2017 | 1 | 11 |
| 2 | 3232 | 0 | 50.0 | 2 | 2016 | 12 | 7 |
| 3 | 1999 | 1 | 1000.0 | 2 | 2017 | 1 | 11 |
| 4 | 1444 | 1 | 200.0 | 2 | 2016 | 11 | 16 |
# Same feature subset for the test set.
new_test= test[['user_id', 'payment_method', 'money_transacted', 'partner_pricing_category', 'transaction_year',
'transaction_month','transaction_day']]
new_test.head()
| user_id | payment_method | money_transacted | partner_pricing_category | transaction_year | transaction_month | transaction_day | |
|---|---|---|---|---|---|---|---|
| 0 | 778 | 5 | 200.0 | 2 | 2016 | 12 | 28 |
| 1 | 67 | 0 | 20.0 | 2 | 2016 | 12 | 14 |
| 2 | 2411 | 2 | -100.0 | 2 | 2017 | 1 | 21 |
| 3 | 2230 | 4 | 200.0 | 1 | 2017 | 1 | 16 |
| 4 | 217 | 0 | 100.0 | 2 | 2017 | 2 | 9 |
# 75/25 train/validation split of the reduced feature set.
x_train, x_val, y_train, y_val= train_test_split(new_train, y, test_size=0.25, random_state=1513)
# Ground-truth labels for the held-out test set (for offline scoring).
solution= pd.read_csv("solution.csv")
solution.head(2)
| transaction_number | is_fraud | |
|---|---|---|
| 0 | 857050141038272 | 0 |
| 1 | 4400462872603 | 0 |
# Test-set labels used to score predictions below.
y_sol= solution['is_fraud']
y_sol.head(2)
0 0 1 0 Name: is_fraud, dtype: int64
from sklearn.tree import DecisionTreeClassifier
# Baseline (unpruned) decision tree on the reduced feature set.
dt2=DecisionTreeClassifier()
dt2.fit(x_train, y_train)
y_pred_new = dt2.predict(x_val)
from sklearn.metrics import confusion_matrix
# Validation confusion matrix as a labelled DataFrame.
pd.DataFrame(confusion_matrix(y_val, y_pred_new), columns=['Predicted No','Predicted Yes'], index=['Actual No','Actual Yes'])
| Predicted No | Predicted Yes | |
|---|---|---|
| Actual No | 19082 | 6 |
| Actual Yes | 1 | 44 |
from sklearn.metrics import classification_report
# Precision/recall/F1 on the validation split.
print(classification_report(y_val, y_pred_new))
precision recall f1-score support
0 1.00 1.00 1.00 19088
1 0.88 0.98 0.93 45
accuracy 1.00 19133
macro avg 0.94 0.99 0.96 19133
weighted avg 1.00 1.00 1.00 19133
# Score the baseline tree on the held-out test set.
y_test_pred_new = dt2.predict(new_test)
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_sol, y_test_pred_new), columns=['Predicted No','Predicted Yes'], index=['Actual No','Actual Yes'])
| Predicted No | Predicted Yes | |
|---|---|---|
| Actual No | 19091 | 3 |
| Actual Yes | 7 | 32 |
from sklearn.metrics import classification_report
# Precision/recall/F1 on the held-out test set.
print(classification_report(y_sol, y_test_pred_new))
precision recall f1-score support
0 1.00 1.00 1.00 19094
1 0.91 0.82 0.86 39
accuracy 1.00 19133
macro avg 0.96 0.91 0.93 19133
weighted avg 1.00 1.00 1.00 19133
import matplotlib.pyplot as plt
# Cost-complexity pruning path: candidate ccp_alpha values and the total
# leaf impurity each alpha would produce, used to pick a pruning strength.
path = dt2.cost_complexity_pruning_path(new_train, y)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
fig, ax = plt.subplots()
# Drop the last point (the trivial single-node tree) for a readable plot.
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set")
Text(0.5, 1.0, 'Total Impurity vs effective alpha for training set')
# Pruned/regularised tree, refit on the FULL training data this time.
dt2 = DecisionTreeClassifier(ccp_alpha= 0.000015, random_state= 7, max_depth= 7, min_samples_leaf= 8)
dt2.fit(new_train, y)
y_test_pred_new = dt2.predict(new_test)
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_sol, y_test_pred_new), columns=['Predicted No','Predicted Yes'], index=['Actual No','Actual Yes'])
| Predicted No | Predicted Yes | |
|---|---|---|
| Actual No | 19091 | 3 |
| Actual Yes | 4 | 35 |
from sklearn.metrics import classification_report
# Test-set report for the pruned tree (fraud recall improves to 0.90).
print(classification_report(y_sol, y_test_pred_new))
precision recall f1-score support
0 1.00 1.00 1.00 19094
1 0.92 0.90 0.91 39
accuracy 1.00 19133
macro avg 0.96 0.95 0.95 19133
weighted avg 1.00 1.00 1.00 19133
from sklearn import metrics
# NOTE(review): R² is a regression metric; on 0/1 labels its value is hard to
# interpret — the classification report and F1 are the meaningful scores here.
r_square2= metrics.r2_score(y_sol, y_test_pred_new)
print('R Squared error associated with pruned Decision Tree is:', r_square2)
R Squared error associated with pruned Decision Tree is: 0.8201462132016233
from sklearn.metrics import f1_score
# Final headline metric: F1 on the fraud class, as a percentage.
print("f1 score: ",(f1_score(solution.is_fraud, y_test_pred_new))*100)
f1 score: 90.9090909090909
Recommendations for IndAvenue based on the visualizations:
• unified_payments_interface (UPI) is the only payment method with no recorded fraud; however, its transactions are also the fewest and it remains the least used. The value of money_transacted is also only up to a few hundred rupees. So UPI can be a go-to option for payments of smaller amounts.
• Beyond 25k, only visa_master_debit_cards seem to have a few transactions that aren't fraud, and even then only when the amount is under 50k.
• Partners like 39445 (19), 71001 (15), 173558 (89) that have fewer than 100 transactions and yet are resulting in fraudulent payments should likely be discontinued.
• Partners resulting in fraudulent payments: 23667 (19526), 39445 (19), 47334 (26105), 71001 (15), 7889 (2317), 102557 (231), 118335 (9546), 165669 (1216), 173558 (89).
• Only categories 1, 2, 3, and 8 are the ones where frauds have been committed.
• -200 to 400 is the range where there are the fewest frauds, so we can bin it as green; -750 to -200 and 420 to 700 have a moderate amount of frauds, so we can bin them as yellow; -20000.0 to -750 and 700 to 197217.76 are the amounts where almost all the payments are fraudulent, so we can bin them as red.
from sklearn import tree
# Visualize the pruned decision tree with the selected feature names.
plt.figure(figsize=(40,40))
features = new_train.columns
#classes = ['Not heart disease','heart disease']
tree.plot_tree(dt2,feature_names=features, filled=True)
plt.show(True)
The above visualization will help decision-makers to visualize the big picture of the current situation.
So, that they can understand the specific road where the desired outcome has the highest likelihood while you can also recognize the situations which may end up with undesired consequences.
If-then scenarios can be created using the visualization above, like:
• If the money transacted is <=44,500 and transaction_day > 6.5 then there are frauds committed. Which means that for values <=44,500 the frauds are committed in the later days of the month.
The model used above is a Decision Tree Classifier.
Decision tree Algorithm is a type of supervised learning for classification problems. Classification Is the process of dividing data sets into different categories or groups by adding label (Ex- fraud/ not fraud)
It gives a graphical representation of all the possible solutions to a decision based on certain conditions. These conditions are then used to create the if-then rules to make decisions.
• It builds the model in the form of tree.
• It breaks down a dataset into smaller and smaller subsets as decision tree is incrementally developed.
• Tree has decision node (Attributes) and leaf nodes (final classification label). Root of the decision tree is best predictor node.
• Trees are constructed in a top-down, recursive manner using divide-and-conquer strategy.
For test records, conditions are checked using if-else and record gets classified.
To ensure fast customer checkout, the following strategies can be used:
• The vendor can use audible payment sound boxes, so that whenever a payment is successfully completed it is announced via the system.
• The red, yellow, and green payment bins should be handled differently, with red bins (higher amounts transacted) having a separate payment queue and double-checking to ensure complete safety.
• The people paying smaller amounts and belonging to green category should be encouraged to use upi for payments, as they are fast and also have no reported frauds.
• There should be a stable internet connection so that the payments are processed successfully and there is no glitch midway.